{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "vscode": {
     "languageId": "bat"
    }
   },
   "source": [
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pinecone-io/examples/blob/master/learn/search/meeting-transcription-search/meeting_transcription_semantic_search.ipynb)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Semantic Search over your Meeting audio data\n",
    "\n",
    "This notebook demonstrates how to quickly enable semantic search given a single audio file with Pinecone and Hugging Face. Don't have one handy? No problem, use\n",
    "the sample audio instead."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: datasets in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (3.1.0)\n",
      "Requirement already satisfied: transformers in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (4.37.2)\n",
      "Requirement already satisfied: pinecone in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (6.0.2)\n",
      "Requirement already satisfied: filelock in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (3.13.1)\n",
      "Requirement already satisfied: numpy>=1.17 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (1.26.4)\n",
      "Requirement already satisfied: pyarrow>=15.0.0 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (18.0.0)\n",
      "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (0.3.8)\n",
      "Requirement already satisfied: pandas in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (2.2.2)\n",
      "Requirement already satisfied: requests>=2.32.2 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (2.32.3)\n",
      "Requirement already satisfied: tqdm>=4.66.3 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (4.67.1)\n",
      "Requirement already satisfied: xxhash in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (3.5.0)\n",
      "Requirement already satisfied: multiprocess<0.70.17 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (0.70.16)\n",
      "Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets) (2024.3.1)\n",
      "Requirement already satisfied: aiohttp in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (3.11.7)\n",
      "Requirement already satisfied: huggingface-hub>=0.23.0 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (0.26.2)\n",
      "Requirement already satisfied: packaging in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (23.2)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (6.0.1)\n",
      "Requirement already satisfied: regex!=2019.12.17 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from transformers) (2023.10.3)\n",
      "Requirement already satisfied: tokenizers<0.19,>=0.14 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from transformers) (0.15.1)\n",
      "Requirement already satisfied: safetensors>=0.4.1 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from transformers) (0.4.2)\n",
      "Requirement already satisfied: certifi>=2019.11.17 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from pinecone) (2024.2.2)\n",
      "Requirement already satisfied: pinecone-plugin-interface<0.0.8,>=0.0.7 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from pinecone) (0.0.7)\n",
      "Requirement already satisfied: python-dateutil>=2.5.3 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from pinecone) (2.9.0)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from pinecone) (4.12.2)\n",
      "Requirement already satisfied: urllib3>=1.26.0 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from pinecone) (2.1.0)\n",
      "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from aiohttp->datasets) (2.4.3)\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from aiohttp->datasets) (1.3.1)\n",
      "Requirement already satisfied: attrs>=17.3.0 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from aiohttp->datasets) (24.2.0)\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from aiohttp->datasets) (1.5.0)\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from aiohttp->datasets) (6.1.0)\n",
      "Requirement already satisfied: propcache>=0.2.0 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from aiohttp->datasets) (0.2.0)\n",
      "Requirement already satisfied: yarl<2.0,>=1.17.0 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from aiohttp->datasets) (1.18.0)\n",
      "Requirement already satisfied: six>=1.5 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from python-dateutil>=2.5.3->pinecone) (1.16.0)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from requests>=2.32.2->datasets) (2.0.4)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from requests>=2.32.2->datasets) (3.7)\n",
      "Requirement already satisfied: pytz>=2020.1 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from pandas->datasets) (2024.1)\n",
      "Requirement already satisfied: tzdata>=2022.7 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from pandas->datasets) (2024.1)\n"
     ]
    }
   ],
   "source": [
    "## Installs\n",
    "!pip install datasets transformers pinecone"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Grab your desired audio file compatible with Hugging Face Pipelines and put it here\n",
    "from getpass import getpass\n",
    "import os \n",
    "audio_path = \"\"\n",
    "transcription_result = []\n",
    "\n",
    "api_key =  os.environ.get('PINECONE_API_KEY')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create a dataset or upload your own file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "from transformers import pipeline\n",
    "\n",
    "pipeline = pipeline(\n",
    "    task=\"automatic-speech-recognition\",\n",
    "    model=\"openai/whisper-large-v3\",\n",
    ")\n",
    "\n",
    "\n",
    "if audio_path == \"\":\n",
    "    # use Hugging Face Sample Code instead, located here https://huggingface.co/learn/audio-course/en/chapter7/transcribe-meeting\n",
    "    concatenated_librispeech = load_dataset(\n",
    "    \"sanchit-gandhi/concatenated_librispeech\", split=\"train\")\n",
    "    transcription_result = pipeline(concatenated_librispeech[0][\"audio\"][\"array\"], return_timestamps=True)\n",
    "    transcription_result\n",
    "else:\n",
    "    # Use your own audio file, check out this for details: https://huggingface.co/openai/whisper-large-v3\n",
    "    transcription_result = pipeline(audio_path, return_timestamps=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'timestamp': (0.0, 15.1), 'text': ' the second in importance is as follows sovereignty may be defined to be the right of making laws in france the king really exercises a portion of the sovereign power since the laws have no weight'}, {'timestamp': (15.1, 21.72), 'text': \" he was in a fevered state of mind owing to the blight his wife's action threatened to cast upon his entire future\"}]\n"
     ]
    }
   ],
   "source": [
    "print(transcription_result[\"chunks\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert into records and upsert with Integrated Inference\n",
    "\n",
    "\n",
    "Integrated Inference from Pinecone lets you embed your records with a hosted embedding model, and upsert them into\n",
    "a Pinecone index at the same time! We've included some batching code in case your audio file is long."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "## use sentences as chunks, and transform into records for upsertion\n",
    "\n",
    "# Turn into records\n",
    "records = [\n",
    "    {\n",
    "        \"_id\": str(idx),\n",
    "        \"sentence\": chunk[\"text\"],\n",
    "        # add any other desired metadata here\n",
    "    }\n",
    "    for idx, chunk in enumerate(transcription_result[\"chunks\"])\n",
    "]\n",
    "\n",
    "# Import the Pinecone library\n",
    "from pinecone import Pinecone\n",
    "\n",
    "# Initialize a Pinecone client with your API key\n",
    "pc = Pinecone(api_key=api_key)\n",
    "namespace = \"meeting-1\"\n",
    "# Create a dense index with integrated embedding\n",
    "index_name = \"meeting-transcription-index\"\n",
    "if not pc.has_index(index_name):\n",
    "    pc.create_index_for_model(\n",
    "        name=index_name,\n",
    "        cloud=\"aws\",\n",
    "        region=\"us-east-1\",\n",
    "        embed={\n",
    "            \"model\":\"llama-text-embed-v2\",\n",
    "            \"field_map\":{\"text\": \"sentence\"}\n",
    "        }\n",
    "    )\n",
    "\n",
    "index = pc.Index(index_name)\n",
    "# query."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "# upsert into pinecone\n",
    "def batch_upsert(records, batch_size=96, namespace=namespace):\n",
    "    # Great for longer audio files and batches of sentences\n",
    "    for i in range(0, len(records), batch_size):\n",
    "        batch = records[i:i+batch_size]\n",
    "        index.upsert_records(namespace=namespace, records=batch)\n",
    "\n",
    "batch_upsert(records)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Query the index with integrated inference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace with your own query here if needed\n",
    "import time\n",
    "query = \"Tell me about the king of France\"\n",
    "\n",
    "# Depending on the size of your dataset, it may take a few seconds for it to finish\n",
    "# embedding and populating into the index.\n",
    "time.sleep(10)\n",
    "\n",
    "results = index.search(\n",
    "    namespace=namespace,\n",
    "    query={\n",
    "        \"inputs\": {\"text\": query},\n",
    "        \"top_k\": 5,\n",
    "    },\n",
    ")\n",
    "\n",
    "print(results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cleanup\n",
    "\n",
    "#pc.delete_index(name=index_name)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pinecone-examples",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
